import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import re
from gensim.models import KeyedVectors
from sklearn.svm import SVC

# Load the raw CSV files. Only the training frame is cleaned here: exact
# duplicate rows are dropped and missing cells become empty strings.
train_data = pd.read_csv(r'C:\Users\iverson\Desktop\train.news.csv')
test_data = pd.read_csv(r'C:\Users\iverson\Desktop\test.feature.csv')

train_data.drop_duplicates(inplace=True)
train_data.fillna('', inplace=True)

# Matches any character that is neither a word character nor whitespace.
pattern = r'[^\w\s]'


def _strip_punctuation_and_spaces(text):
    """Remove characters matched by ``pattern``, then remove literal spaces."""
    without_punctuation = re.sub(pattern, '', text)
    return re.sub(r' ', '', without_punctuation)


titles_train = train_data['Title'].map(_strip_punctuation_and_spaces)

# One stop word per line; surrounding whitespace is stripped from each entry.
with open(r'C:\Users\iverson\Desktop\stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stop_words = {entry.strip() for entry in stopword_file}


def _tokenize_without_stop_words(text):
    """Segment ``text`` with jieba and join the non-stop-word tokens with spaces."""
    kept_tokens = [token for token in jieba.cut(text) if token not in stop_words]
    return ' '.join(kept_tokens)


titles_train = titles_train.map(_tokenize_without_stop_words)

labels = train_data['label']

# Feature extraction: load the pre-trained word vectors (text format).
word2vec_model = KeyedVectors.load_word2vec_format(
    r'C:\Users\iverson\Desktop\学习\sgns.wiki.bigram-char',
    binary=False,
    encoding='utf-8',
)


# Convert the title text of the training and test sets into averaged word-vector representations
def get_word2vec_features(titles, word2vec_model):
    """Build one averaged word vector per title.

    Each title is split on whitespace; every token found in
    ``word2vec_model`` is looked up and the vectors are averaged
    element-wise. A title with no known tokens maps to a zero vector.

    Args:
        titles: Iterable of whitespace-separated token strings.
        word2vec_model: Object supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute
            (e.g. gensim ``KeyedVectors``).

    Returns:
        A 2-D array with one row per title and ``vector_size`` columns.
        An empty ``titles`` yields shape ``(0, vector_size)`` so the result
        can still be stacked column-wise with other feature matrices.
    """
    dim = word2vec_model.vector_size
    features = []
    for title in titles:
        known_vectors = [word2vec_model[word] for word in title.split() if word in word2vec_model]
        if known_vectors:
            features.append(np.mean(known_vectors, axis=0))
        else:
            # None of the title's tokens are in the model: fall back to zeros.
            features.append(np.zeros(dim))
    if not features:
        # np.array([]) would be 1-D with shape (0,); keep the column dimension.
        return np.empty((0, dim))
    return np.array(features)


# Fit TF-IDF on the tokenised training titles and compute the averaged
# word-vector features for the same titles.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(titles_train)
word2vec_features_train = get_word2vec_features(titles_train, word2vec_model)

# Densify the TF-IDF matrix and append the word-vector columns to each row.
X_train_combined = np.concatenate(
    [X_train.toarray(), word2vec_features_train],
    axis=1,
)

# Train the model.
# NOTE(review): scikit-learn documents gamma as the coefficient for the
# rbf/poly/sigmoid kernels, so it presumably has no effect with
# kernel='linear' — confirm before tuning it.
model_combined = SVC(kernel='linear', C=5, gamma=0.05).fit(X_train_combined, labels)

# Predict on the test set.
# Missing titles are replaced with '' first (the training frame gets the same
# treatment via fillna); otherwise re.sub receives a NaN float and raises
# TypeError.
titles_test = test_data['Title'].fillna('')
titles_test = titles_test.apply(lambda x: re.sub(pattern, '', x))
titles_test = titles_test.apply(lambda x: re.sub(r' ', '', x))
titles_test = titles_test.apply(lambda x: ' '.join([word for word in jieba.cut(x) if word not in stop_words]))
word2vec_features_test = get_word2vec_features(titles_test, word2vec_model)
# Reuse the vectorizer fitted on the training titles so the columns line up.
X_test = vectorizer.transform(titles_test)
X_test_combined = np.hstack((X_test.toarray(), word2vec_features_test))
y_pred_test = model_combined.predict(X_test_combined)

# Save the predictions to a CSV file, keyed by the test-set id column.
result_df = pd.DataFrame({'id': test_data['id'], 'label': y_pred_test})
result_df.to_csv(r'C:\Users\iverson\Desktop\result.csv', index=False)
